\documentclass{article}
\usepackage{amssymb}
\usepackage{amsmath}
\usepackage{amsthm}
\usepackage{mathtools}
\usepackage{hyperref}
\DeclarePairedDelimiter\abs{\lvert}{\rvert}
\DeclarePairedDelimiter\norm{\lVert}{\rVert}
\DeclarePairedDelimiter\inner{\langle}{\rangle}
\def\P{\mathcal{P}}
\def\E{\mathbb{E}}
\DeclarePairedDelimiter\floor{\lfloor}{\rfloor}
\DeclarePairedDelimiter\ceil{\lceil}{\rceil}
\DeclareMathOperator*{\diag}{diag}
\DeclareMathOperator*{\argmin}{argmin}
\newtheorem{lemma}{Lemma}
\newtheorem{remark}{Remark}
\newtheorem{proposition}{Proposition}
\title{Non-linearity in random variable approximation}
\author{Feng Zhao}

\begin{document}
\maketitle
\section{Abstract}
We find that under orthogonal feature (which can be obtained by PCA techniques etc.), using Hermite orthogonal polynomial as activation function is universally good. 
\section{Background}
Suppose we have two discrete random variables $X, Y$. $Y$ is binary (0 or 1). $X$ is from $\{1, \dots, \abs{\mathcal{X}}\}$. We want to approximate $P_{Y|X=x}(0)$ (approximating $P_{Y|X=x}(0)$ is through $1-P_{Y|X=x}(1)$). We suppose the dimension $n$ of $X$ is very large. The observation of $X$ is through $k$ given feature function $f_1, \dots f_k$. 
A simple approximation of $P_{Y|X=x}(0)$ is through $ \sum_{i=1}^k w_i f_i(x)$. To increase the approximation power we introduce a non-linear transformation function $ \sigma$. That is, we have
$g(x)  = P_{Y|X=x}(0) \approx \sigma(\sum_{i=1}^k w_i f_i(x))$. This formulation is very similar to one-node neural networks. If we take $\sigma$ as sigmoid function and treat the output of neural network as
posterior probability, our loss function is $ \E[y - \sigma(\sum_{i=1}^k w_i f_i(x))]$. 

Now we have only $n$(very large) samples, the loss function to minimize globally is
$\norm{g - \sigma(\sum_{i=1}^k w_i f_i)}^2$ using L2 loss. 
$g, f_i$ is an $n\times 1 $ vector.

We want to find an activation function that is good on average. We need to formulate the problem in more detail. Since We know little about $g$ and $f$, we can treat them as uniformly distributed. $g$ might have the non-negative constraint. We can make a simple transformation (e.g. $\log$) to remove the constraint and make some normalization. All these operations are inversible. We assume that $g$ is Gaussian distributed, $f_1, \dots, f_k$ are orthogonal to each other and they are uniformly distributed on a sphere.

\section{Problem}
Let $Y$ be standard Gaussian vector. each component has $ \frac{1}{n}$ variance.
$X_1, \dots, X_k$ be
n-dimensional uniformly distributed random vector on unit sphere.
$X=(X_1, \dots, X_k)$ is an $n\times k$ random matrix
truncated from a $n\times n$ random orthogonal matrix.
$Y$ is independent with $X$.
Each sample $x_1, \dots, x_k$ from $X$ have the property that
$x_i \cdot x_j = 0$. Suppose $\norm{Y}=1$.
We would like to compute the following quantity:
\begin{equation}\label{eq:Eminw}
\E[\min_w \norm{Y - \sigma (X w ) }^2]
\end{equation}
$\sigma$ is generally a non-linear scalar function.
Its application on a vector is element-wise.
\begin{lemma}\label{lem:uniform}
Suppose $X$ is an $n$ by $k$ random matrix.
The samples $x_1, \dots, x_k$ from $X$ have the property that
$x_i \cdot x_j = 0, \norm{x_i}=1$ where $x_i$ is the $i$-th column.
We also require $X_i$ is uniformly distributed on an unit sphere.
Let $A=X X^T$, then we have
\begin{equation}
E[A_{ij}]= \begin{cases}
\frac{k}{n} & i = j\\
0 & i\neq j
\end{cases}
\end{equation}
\end{lemma}
\begin{proof}
We can use a generator model to simulate the sampling of $X$.
First we random select $x_1$
from uniform distributed random variable on an $n$ dimensional sphere.
Then $x_2$ should be selected from the $n-1$ dimensional sphere
orthogonal to $x_1$ and so on.
It is easier to show $\E[X_1X_1^T] = \frac{1}{n} I_n$
since we can write
$X_1 = \frac{(a_1, a_2, \dots a_n) }{\sqrt{a_1^2+\dots + a_n^2}}$
where $a_1, \dots a_n$ are i.i.d Gaussian.

To show $\E[A]=\frac{k}{n}I_n$ we need to show respectively
$\E[X_iX_i^T]=\frac{1}{n} I_n$.
We first show this equality holds for $i=2$.
From the generator model, $X_2$ depends on $X_1$.
By the Law of total expectation we have
$\E[X_2 X_2^T] = \E_{X_1}[\E[X_2 X_2^T |X_1 = x_1]]$.
$X_2 | x_1$ is a random variable distributed on an $n-1$ sphere.
We assume $b_1, \dots, b_{n-1}$ is an orthogonal unit basis
for this $n-1$ dimensional space,
then $b_1, \dots, b_{n-1}, x_1$ are an orthogonal unit basis
for the $n$ dimensional space. Therefore we have
$\sum_{i=1}^{n-1} b_i b_i^T = I_n -  x_1 x_1^T $.
The RHS is fixed for given $x_1$,
which is irrelevant with the choice of $b_1, \dots, b_{n-1}$.
We can show that the inner expectation
$\E[X_2 X_2^T |X_1] = \frac{1}{n-1}(I_n - x_1 x_1^T)$
since we can choose
$X_2 = \frac{1}{\sqrt{a_1^2 + \dots + a_{n-1}^2}}\sum_{i=1}^{n-1} a_i b_i$
($b_i$ is fixed vector while $a_i$ is scalar random vector).
Then taking the outer expectation we have
$\E[X_2 X_2^T] = \frac{1}{n-1} (I_n - \frac{1}{n} I_n) = \frac{1}{n} I_n$.

For $i>2$, the proof is similar as we have
$$
\E[X_i X_i^T] =
\E_{X_1, \dots, X_{i-1}} [\E[X_i X_i^T | x_1, \dots x_{i-1}]]
$$
which equals $\frac{1}{n-i+1}(I_n - \frac{i-1}{n} I_n) = \frac{1}{n} I_n$.
\end{proof}
\begin{remark}
To compute higher order moments $\E[A_{ii}^t]$.
We can approximate it by $r^t$ where $r=\frac{k}{n}$.
We require that $k$ is large and  $ t << k$.
Indeed, $A_{ii} = \sum_{j=1}^k x_{ij}^2$
\begin{align*}
\E[A^t_{ii}] & = \E[(\sum_{j=1}^k X_{ij}^2)^t] \\
\approx & k^t \E[X^2_{ij_1}X^2_{ij_2}\dots X^2_{ij_t}]
\end{align*}
Using the conclusion from Lemma \ref{lem:uniform},
we can get for example $\E[X_{11}^2 X_{12}^2]
= \E_{X_1}[X_{11}^2 \E[ X^2_{12}| X_1]]
= \frac{1}{n-1}\E[X_{11}^2 ( 1 - X_{11}^2 )] = \frac{1}{n-1}(\frac{1}{n} - \frac{3}{n(n+2)}$.
Therefore we get the result that $\E[X_{11}^2 X_{12}^2]  =  \frac{1}{n(n+2)}$
\end{remark}
\begin{remark}
Exact solution for $t=2$. The fourth order moment for each component of
uniform distribution on the sphere is $\frac{3}{n(n+2)}$. Then we have
\begin{align*}
\E[(X_{11}^2 + \dots + X_{1k}^2)^2] &=
k\frac{3}{n(n+2)} + k(k-1)\frac{1}{n-1} (\frac{1}{n} - \frac{3}{n(n+2)}) \\
&=\frac{k(k+2)}{n(n+2)}
\end{align*}
In general, we can treat $X_{11}, \dots, X_{1k}$
as k components of a uniform random vector on the sphere. And we have
\begin{equation}
\E[A_{ii}^t] = \prod_{i=0}^{t-1}\frac{k+2i}{n+2i}
\end{equation}
\end{remark}
\begin{remark}
$\E[A_{12}^2] = \frac{(n-k)k}{(n-1)n(n+2)}$
\end{remark}
\begin{lemma}\label{lem:x2y2}
Suppose $(X,Y)$ is two-dimensional Gaussian vector,
has zero mean and covariance vector $\Sigma$, then
\begin{align*}
\E[X^2 Y^2] &= \Sigma_{11}\Sigma_{22} + 2\Sigma_{12}^2 \\
\E[X^3 Y + Y^3 X ] &= 3 (\Sigma_{11} + \Sigma_{22}) \Sigma_{12} \\
\E[X^3 Y^3] & = 6 \Sigma_{12}^3 + 9 \Sigma_{12} \Sigma_{11} \Sigma_{22}\\
\E[X^4 Y^4] & = 24 \Sigma_{12}^4 +
72 \Sigma_{12}^2 \Sigma_{11} \Sigma_{22} + 9\Sigma_{11}^2 \Sigma_{22}^2
\end{align*}
For more formulas, please see Isserlis' theorem, or
\href{https://en.wikipedia.org/wiki/
Multivariate_normal_distribution#Higher_moments}{Higher moments}
in wikipedia.
\end{lemma}
\begin{remark}
Suppose $ i + j $ is even, using Isserlis's theorem we have
\begin{equation}\label{eq:Isserlis}
\E[X^i Y^j] =
\sum_{k=0, k+i \textrm{ is even}}^{\min\{i,j\}}
\frac{i! j!}{k! (i-k)!!(j-k)!!}
\Sigma_{12}^k \Sigma_{11}^{(i-k)/2}\Sigma_{22}^{(j-k)/2}
\end{equation}
\end{remark}
\begin{proof}
Let $\Sigma = L^T L $ and $\binom{x}{y} = L^T \binom{x'}{y'}$,
then $\binom{x'}{y'}$ has identity covariance.
By Cholesky decomposition we can make $L$ be upper trianglar matrix
($L_{12}=0$) with $L_{11}^2 = \Sigma_{11},
L_{11}L_{12} = \Sigma_{12},
L_{22}^2 = \Sigma_{22} - \frac{\Sigma_{12}^2}{\Sigma_{11}}$.
Then we have $x = L_{11} x', y = L_{12} x' + L_{22} y'$. Therefore
\begin{align*}
\E[X^2 Y^2] & = L_{11}^2 (L_{12}^2\E[X'^4]+ L^2_{22}\E[X'^2]\E[Y'^2]) \\
& = L_{11}^2(3L_{12}^2 + L^2_{22}) \\
& = \Sigma_{11}\Sigma_{22} + 2\Sigma_{12}^2
\end{align*}
\end{proof}
\begin{lemma}\label{lem:abcd}
Suppose $X_1, X_2, Y_1, Y_2$ are gaussian random variables and
the covariance of their join distribution is block diagonal:
$(X_1, X_2, Y_1, Y_2) \sim N(0, \binom{\Sigma_1, 0}{0, \Sigma_2})$.
Then the following equality holds:
\begin{equation}
\E[f(X_1X_2)g(Y_1Y_2)] = \E[f(X_1 X_2)] \E[g(Y_1 Y_2)]
\end{equation}
\end{lemma}
\begin{proof}
\begin{align*}
\E[f(X_1X_2)g(Y_1Y_2)] & =
\int \frac{f(x_1 x_2) g(y_1 y_2)}
{(2\pi)^2 \sqrt{\abs{\Sigma_1}\abs{\Sigma_2}}}
\exp(-\frac{1}{2}(x_1, x_2) \Sigma_1^{-1} \binom{x_1}{x_2} -
\frac{1}{2}(y_1, y_2) \Sigma_1^{-1} \binom{y_1}{y_2})dx_1 d x_2 dy_1 dy_2 \\
& = \int \frac{f(x_1 x_2)  }{(2\pi) \sqrt{\abs{\Sigma_1}}}
\exp(-\frac{1}{2}(x_1, x_2) \Sigma_1^{-1} \binom{x_1}{x_2})dx_1 dx_2  \\
&\cdot \int \frac{g(y_1 y_2)}{(2\pi) \sqrt{\abs{\Sigma_1}}}
\exp(-\frac{1}{2}(y_1, y_2) \Sigma_1^{-1} \binom{y_1}{y_2})dy_1 dy_2 \\
& = \E[f(X_1 X_2)] \E[g(Y_1 Y_2)]
\end{align*}
We can also use the property of Gaussian distribution directly:
$X=(X_1, X_2)$ and $Y=(Y_1, Y_2)$
are both joint Gaussian random vector and
these two are independent with each other from their join distribution.
Then they are uncorrelated.
\end{proof}

We assume $\sigma(z) = z + \epsilon \xi(z)$.
When $\epsilon = 0$, the optimal $w$ for given $X, Y$ is
$\bar{w} =X^T Y $.
For small $\epsilon$, we assume
$w = \bar{w} + \epsilon \hat{w} + \epsilon^2 \tilde{w}$.
Then we can expand $\norm{Y - \sigma (X w ) }^2$ as follows:
\begin{align*}
\norm{Y - \sigma (X w ) }^2 &=
\norm{Y - X \bar{w} - \epsilon X \hat{w} -
\epsilon^2 X \tilde{w} - \epsilon \xi(X \bar{w} + \epsilon X\hat{w}) }^2 \\
& = \norm{ Y - X  \bar{w}  -
\epsilon (X\hat{w} + \xi(X \bar{w} )) -
\epsilon^2(X\tilde{w} + \nabla \xi(X \bar{w} )X\hat{w})}^2 \\
& = \norm{Y-X \bar{w}  }^2 -
2 \epsilon (X \hat{w} + \xi(X \bar{w}))^T (Y-X \bar{w}) +\\
& +\epsilon^2(\norm{X\hat{w} + \xi(X \bar{w})}^2 -
2 (X \tilde{w}+ \nabla \xi(X \bar{w})X\hat{w})^T(Y-X \bar{w}))
\end{align*}
In the above formula,  we expand $\xi$ at $X \bar{w}$.
The Jacobi is actually a diagonal matrix whose $i$-th entry is
$\xi'([X\bar{w}]_i)$.
$[\cdot]_i$ represents the $i$-th element of a vector.
We first notice that for given $X$,
$X\bar{w}$ is the projection of $Y$ onto linear subspace spanned by
columns of $X$.
We use $\tilde{Y}$ to denote the mirror of $Y$ about this linear subspace.
Then we have
$XX^T Y = XX^T \tilde{Y}$ and
$(Y- XX^TY) = -(\tilde{Y} - XX^T \tilde{Y})$.
This refers to $\xi(X\bar{w})^T (Y-X\bar{w})$.
For $(X\hat{w})^T (Y-X\bar{w})$,
we use the property that $X^T X = I_k$.
By symmetric property,
the integration of coefficient of $\epsilon$ with respect to $Y$
is zero (given $X$).
For the coefficient of $\epsilon^2$ we have
\begin{equation*}
\textrm{Coeff}(\epsilon^2) =
\E[\norm{X\hat{w} + \xi(XX^TY)}^2 -
2 (\nabla \xi(XX^TY)X\hat{w})^T(Y-XX^TY)]
\end{equation*}
We have written Equation \eqref{eq:Eminw} in
$$ \norm{Y - X \bar{w}}^2 + \textrm{Coeff}(\epsilon^2)
\epsilon^2 + o(\epsilon^2)$$.

Minimizing $\E[\textrm{Coeff}(\epsilon^2)]$ we have
\begin{align*}
C[\xi] &:= \min \textrm{Coeff}(\epsilon^2) \\
&=
\E[\norm{\xi(XX^TY)}^2 -
\norm{X^T\xi(XX^TY)}^2 -
\norm{X^T \nabla\xi(XX^TY)(Y-XX^TY)}^2]
\end{align*}
The minimal value is achieved at
$$
\hat{w} =  X^T(\nabla\xi(XX^T Y)
(Y-XX^T Y) - \xi(XX^T Y))
$$
Below we assume $X$ is given
(the expectation is taken first about $Y$ and then about $X$),
the columns of $X$ are orthogonal by the given condition
($X^TX=I_k$) and
$Y_1, \dots, Y_n $ be component variable of $Y$.
The covariance matrix of $Y$ is $I_n$ assuming $Y_i$ are i.i.d. std Gaussian.
We use $I_1, I_2, I_3$ to denote
the three terms of $C[\xi]$ with given $X$. That is
$C[\xi] = \E_X[I_1 + I_2 + I_3]$.
Also $A=XX^T$ is an $n\times n$ matrix.
Let $Z = AY$. $Z | X$ is Gaussian and
its covariance matrix is
$A\E[YY^T]A^T = \frac{1}{n}AA^T = \frac{1}{n}A$
(from Lemma \ref{lem:uniform}).
Also, both $Y$ and $Z$ have zero mean (vector). The joint distribution of $(Z, X)$ is the product of Gaussian and multivariant Beta distribution \cite{eaton1989group}.
We then have
\begin{equation*}
I_1 =\E[\norm{\xi(Z)}^2] = \sum_{i=1}^n \E_{Z|X}[\xi^2(Z_i)]
\end{equation*}

For $I_2$
we have
\begin{equation*}
-I_2 = \sum_{i,j=1, i \neq j}^n A_{ij}\E[\xi(z_i)\xi(z_j)] +
\sum_{i=1}^n A_{ii}  \E[\xi^2(z_i)]
\end{equation*}
where $Z_i, Z_j$ have the joint distribution $p_{ij}$.

%$A_{ii} = \sum_{j=1}^k x^2_{ij} < 1$.
Then we have
\begin{equation}\label{eq:I1plusI2}
I_1+ I_2 = \sum_{i=1}^n (1-A_{ii}) \E[\xi^2(z_i)] -
\sum_{i,j=1, i \neq j}^n A_{ij}\E[\xi(z_i)\xi(z_j)]
\end{equation}
For $I_3$, we have
\begin{align*}
-I_3 & = \E_Y[\norm{X^T \nabla\xi(XX^TY)(Y-XX^TY)}^2] \\
& = \sum_{i,j=1, i \neq j}^n A_{ij}\Sigma_{ij} +
\sum_{i=1}^n A_{ii}\Sigma_{ii}
\end{align*}
For $\Sigma_{ii}$ we have
\begin{align}\label{eq:sigmaii}
\Sigma_{ii} &=  \E[ [(I-A)Y]_i^2 [\nabla \xi(AY)]_{i,i}^2] \\
&= \frac{1}{n}(1-A_{ii}) \E[ [\nabla \xi(AY)]_{i,i}^2]
\end{align}

For $\Sigma_{ij}$ with $i\neq j$ we have
\begin{align*}
\Sigma_{ij} &=  \E[ [(I-A)Y]_i [(I-A)Y]_j [\nabla \xi(AY)]_{i,i}
[\nabla \xi(AY)]_{j,j}]  \\
&= - \frac{1}{n}A_{ij} \E[[\nabla \xi(AY)]_{i,i} [\nabla \xi(AY)]_{j,j}]
\end{align*}

We then have
\begin{align*}
I_1+I_2+I_3 &= \sum_{i=1}^n (1-A_{ii})(\E[\xi^2(z_i)] -
\frac{1}{n} A_{ii} \E[ [\nabla \xi(AY)]_{i,i}^2]) \\
&- \sum_{i,j=1, i \neq j}^n A_{ij} (\E[\xi(z_i)\xi(z_j)] -
\frac{1}{n}A_{ij}\E[[\nabla \xi(AY)]_{i,i} [\nabla \xi(AY)]_{j,j}])
\end{align*}
By symmetric property, we can simplify the above equation as:
\begin{align}\label{formularI123}
I_1+I_2+I_3 &=n(1-A_{11})(\E[\xi^2(z_1)] -
\frac{1}{n} A_{11} \E[ [\nabla \xi(AY)]_{1,1}^2]) \\
&- n(n-1)A_{12}(\E[\xi(z_1)\xi(z_2)] -
\frac{1}{n}A_{12}\E[[\nabla \xi(AY)]_{1,1}
[\nabla \xi(AY)]_{2,2}])\nonumber
\end{align}

We consider
$\xi(z) = a_0 + a_1 z + a_2 z^2 + \dots + a_{m-1} z^{m-1} + a_m z^m $,
there will be an $(m+1) \times (m+1) $ matrix $\bar{M}$,
a vector $q = [a_0, a_1, \dots, a_m]^T$ and
we can write $I_1 + I_2 + I_3$
as the quadratic form $ q^T \bar{M} q $.
The element $\bar{M}_{ij}$ is the coefficient of $a_ia_j$.
Since $\bar{M}$ is symmetric we only consider $i\leq j$.

Using Equation \ref{formularI123}, we have
\begin{align*}
\bar{M}_{ij} &= n(1-A_{11}) (\E[Z_1^{i+j}] -
ij \frac{1}{n}A_{11} \E[Z_1^{i+j-2}])  \\
&-n(n-1)A_{12}(\E[Z_1^i Z_2^j] - ij \frac{1}{n}A_{12}\E[Z_1^{i-1}Z_2^{j-1}])
\end{align*}
We can approximate $\E[Z_1^{2t}]$ where
$Z=AY$ assuming $Y$ is a Gaussian vector with variance $\frac{1}{n}$.
Then $\E[Z_1^{2t}] = \frac{1}{n^t}A_{11}^t (2t-1)!!$.

Let $i+j=2t$. Using Equation \eqref{eq:Isserlis} we have:
\begin{align*}
\E[Z_1^i Z_2^j]  -  & ij \frac{1}{n}A_{12}\E[Z_1^{i-1}Z_2^{j-1}] =
\frac{1}{n^t} \sum_{k=0, k+i \textrm{ is even}}^{\min\{i,j\}}
\frac{i! j!}{k! (i-k)!!(j-k)!!}
A_{12}^k A_{11}^{(i-k)/2}A_{22}^{(j-k)/2} \\
- & \frac{ij}{n^t} \sum_{k=0, k+i \textrm{ is odd}}^{\min\{i-1,j-1\}}
\frac{(i-1)! (j-1)!}{k! (i-1-k)!!(j-1-k)!!}
A_{12}^{k+1} A_{11}^{(i-1-k)/2}A_{22}^{(j-1-k)/2} \\
= &\frac{1}{n^t} \sum_{k=0, k+i \textrm{ is even}}^{\min\{i,j\}}
\frac{i! j!}{k! (i-k)!!(j-k)!!}
A_{12}^k A_{11}^{(i-k)/2}A_{22}^{(j-k)/2} \\
- & \frac{1}{n^t} \sum_{k=1, k+i \textrm{ is even}}^{\min\{i,j\}}
\frac{i! j!}{(k-1)! (i-k)!!(j-k)!!}
A_{12}^{k} A_{11}^{(i-k)/2}A_{22}^{(j-k)/2} \\
= &\frac{1}{n^t} \sum_{k=0, k+i \textrm{ is even}}^{\min\{i,j\}}
\frac{(1-k)i! j!}{k! (i-k)!!(j-k)!!}A_{12}^k A_{11}^{(i-k)/2}A_{22}^{(j-k)/2} 
\end{align*}
Therefore we have
\begin{align}
\bar{M}_{ij} &= n(1-A_{11})\frac{A^t_{11}}{n^t}  ((2t-1)!!- ij (2t-3)!!)  \notag\\
&+n(n-1)\frac{1}{n^t}A_{12} \sum_{s=0, s+i \textrm{ is even}}^{\min\{i,j\}}
\frac{(s-1)i!j!}{s!(i-s)!!(j-s)!!}
A_{12}^s A_{11}^{(i - s) /2}A_{22}^{(j - s)/2} \label{eq:Mij_second}
\end{align}

For $\bar{M}_{0j}$ where $j \neq 0$,
it equals to zero if $j$ is an odd number,
let $j=2t$ we can compute that
$\bar{M}_{0,2t}=\frac{1}{n^{t-1}} (1-r)r^t (2t-1)!! $.

For $i = 1$, $ j = 2 t - 1$ we have
$\bar{M}_{ij}$ equal to zero.

For $i \neq 0$, if $i+j$ is odd then $\bar{M}_{ij} = 0$. Now we let $M_{ij} = \E_X[\bar{M}_{ij}]$ and 
$C[\xi]=q^t M q$.
Consider $i \neq j$ and $i+j = 2t$ we then have
$M_{ij} = \frac{1}{n^{t-1}} (2t-1-ij) (1-r)r^t (2t-3)!! =
-n(i-1)(j-1) (1-r)r^{(i+j)/ 2 } (i+j-3)!!$.
Finally, consider $M_{ii}$ with $i \neq 1$. we have $M_{ii} =
-\frac{1}{n^{i-1}} (i-1)^2 (1-r)r^i (2i-3)!!$.

Average over $A_{11}, A^2_{12}$(We have $\E[A^2_{12}]= (1-\frac{k-1}{n-1})\frac{k}{n(n+2)}$),
$A_{22}(\approx r = \frac{k}{n})$.
The summation must start with $s=3$.
The order of $A_{12}^4$ in
the second term of $M_{ij}$ must start with $4$.
Therefore the order of $k$ in second term is smaller than
the first term and we can neglect the second term in Equation \eqref{eq:Mij_second}
if $k$ is sufficiently large.

We can write the above formulation in mathematical way:
Suppose $S=\{s| s+i \textrm{ is even}, s\leq \min\{i,j\} \}$,
We need to compare the order of $\E[A_{11}^{t+1}]$ with the following expression:
\begin{align*}
&(n-1) \sum_{s \in S}
\frac{(s-1)i!j!}{s!(i-s)!!(j-s)!!}
\E[A_{12}^{s+1} A_{11}^{(i - s) /2}A_{22}^{(j - s)/2}]\\
&=(n-1) \max_{s\in S}\{\E[A_{12}^{s+1} A_{11}^{(i - s) /2}A_{22}^{(j - s)/2]}\}\sum_{s \in S}
\frac{(s-1)i!j!}{s!(i-s)!!(j-s)!!}
\end{align*}
Using Proposition \ref{prop:UUM}, we have 
$\sum_{s \in S}
\frac{(s-1)i!j!}{s!(i-s)!!(j-s)!!} = C(t)$. Since we suppose $t << k$, we can treat $C(t)$ as constant.
Therefore, we only need to consider $n\max_{s\in S}\{A_{12}^{s+1} A_{11}^{(i- s) /2}A_{22}^{(j- s)/2}\}$,
Since $A$ is positive definite, $A_{12}^2 \leq A_{11}A_{22}$. Therefore, the maximum value is actually bounded by
$n \E[A_{12}^4 A_{11}^{(i-3)/2} A_{22}^{(j-3)/2}]$.
Using Cauchy Inequality with $B_1 = A_{12}^2 A_{11}^{(i-3)/2}$ and $B_2 = A_{12}^2 A_{22}^{(i-3)/2}$ we have $n\E[B_1B_2] \leq n \sqrt{\E[B_1^2] \E[B_2^2]} = n  \sqrt{\E[ A_{12}^4 A_{11}^{i-3} ]\E[ A_{12}^4 A_{11}^{j-3} ]}$.
When $k, n$ are sufficiently large, by central limit theorem we can treat the joint distribution of $A_{12}, A_{11}$ as Gaussian.
The variance of $A_{12}$ is $\E[A^2_{12}]=(1-\frac{k-1}{n-1}) \frac{k}{n(n+2)}$. We assume $k/n$ is a constant and therefore the variance is $\frac{r(1-r)}{n} \leq \frac{r^2}{4n}$.
We can write $A_{11}^{i-3} = [A_{11} - \E[A_{11}] + \E[A_{11}]]^{i-3}$, $A_{11} - \E[A_{11}]$ has zero mean and variance $\frac{k(k+2)}{n(n+2)} - \frac{k^2}{n^2} = \frac{2r(1-r)}{n}$ which is one order small compared with $\E[A_{11}] = r$.
Therefore, we only need to consider the dominant term: $\E[A^4_{12}]\E[A_{11}]^{i-3}$. Since $A_{12}$ is zero mean Gaussian with variance bounded by $\frac{r^2}{4n}$.  We have $ \E[A^4_{12}]\E[A_{11}]^{i-3} \sim \frac{r^{i+1}}{n^2}$ Similarly we have $ \E[A^4_{12}]\E[A_{22}]^{j-3} \sim \frac{r^{j+1}}{n^2}$. Therefore
The non-diagonal term is bounded by $\frac{r^{(i+j)/2+1}}{n} = \frac{r^{t+1}}{n}$. Compared with $\E[A_{11}^{t+1}] = r^{t+1}$, the non-diagonal term is one order smaller and can be neglected when $n$ is large. 

The computation of $\E[A^2_{12}]$ is as follows:
we have $\E[A_{11}^2] = k\E[X_1^4] + k(k-1)\E[X_1^2 X_2^2] \Rightarrow
\frac{k(k+2)}{n(n+2)} = \frac{3k}{n(n+2)} + (k-1) (k \E[X_1^2 X_2^2])$.
Also $\E[A_{12}^2] = \E[(\sum_{i=1}^k X_1 Y_1)^2]
= k \E[X_1^2X^2_2] + k(k-1)\E[X_1X_2Y_1Y_2]
$.
Since $\sum_{i=1}^n X_i Y_i = 0$, multiplying this equation by $X_2Y_2$ and taking the expectation
we have $\E[X_1X_2Y_1Y_2] = \frac{-1}{n-1} \E[X_1^2 X_2^2]$.

Therefore $\E[A^2_{12}]=(1-\frac{k-1}{n-1}) \frac{k}{n(n+2)}$.

We summarize our deduction as follows:
\begin{equation}
M_{ij} = \begin{cases} 0 & i=1 \textrm{ or } j=1 \textrm{ or } i + j
\textrm{ is odd} \\
 -\frac{1}{n^{(i+j)/2-1}}(i-1)(j-1) (1-r)r^{(i+j)/ 2 } (i+j-3)!! & i+j
 \textrm{ is even and } i,j \neq 1 \\
(1-r)n & i=j=0
\end{cases}
\end{equation}
For example, consider $m = 4$
we can write the $ 5 \times 5 $ matrix as follows ($r'=\frac{r}{n}$):
$$
(1-r)n\begin{pmatrix}
1 & 0 & r'  & 0 & 3r'^2\\
0 & 0 & 0  & 0 & 0\\
r' &  0 & - r'^2 & 0 & -9 r'^3 \\
0 & 0 & 0 & -12r'^3 & 0 \\
3r'^2 & 0 & -9 r'^3 & 0 & -135r'^4
\end{pmatrix}
$$

Now we consider the normalization condition: $I_1 = 1$.
Let $N$ be an $(m+1) \times (m+1)$ matrix.
Then we have $q^T N q = 1$ where
$N_{ij} =\frac{1}{n^{(i+j)/2-1}}r^{(i+j) / 2} (i+j -1)!!$
when $i+j$ is even and $N_{ij} = 0$
when $i+j$ is odd.
The special case is $N_{00} = n$.
Now if we rescale the parameter:
$b_0 =  \sqrt{n}a_0, b_1 = \sqrt{r} a_1, b_2 = r \frac{1}{\sqrt{n}}a_2$
and $b_m = r^{m/2} \frac{1}{n^{(m-1)/2}}a_m$.
Let $p = [b_0, b_1, \dots, b_m]$.
Then $p^T \widetilde{N} p =1$ where
$\widetilde{N}_{ij} = \delta(i,j) (i+j -1)!!$.
Also, the object function $q^T M q$ is simplified as $q^T \widetilde{M} q$
where $\widetilde{M}_{ij} =  -(i-1)(j-1)(1-r) (i+j-3)!! $.
This transformation makes the matrix irrelevant with $n$.

First we let $U^T U = \widetilde{N}$.
We show that $U_{ij} = \delta(i,j)\frac{j!}{(j-i)!!\sqrt{i!}}$
where $U$ is an upper triangular matrix.
$\delta(i,j)$ is an indicator function:
\begin{equation}
\delta_{ij} = \frac{1}{2}(1+(-1)^{i+j})=\begin{cases}
1 & i+j \textrm{ is even} \\
0 & i+j \textrm{ is odd}
\end{cases}
\end{equation}
We can show that $\widetilde{M} = (1-r)U^T \Lambda U$ where
$\Lambda = \diag\{1,0, -1, -2, \dots, -(m-1)\}$.

Since $p^T U^T U p = 1$, and the minimal value $-(m-1)$ for
$\textrm{Coeff}(\epsilon^2) = p^T U^T \Lambda U p $ is achieved
when $U p = (0, \dots, 0, 1)^T = e_m$.
Solving $ U p = e_m $ can we get the value of $p$.
That is: $p_i = (-1)^{(m-i)/2} \frac{m!}{i!(m-i)!! \sqrt{m!}} $
when $i+m$ is even; $p_i = 0 $ when $i+m$ is odd.
Transforming back to $a_i$ we have:
\begin{equation}
a_i =(-1)^{(m-i)/2} \delta(i,m) \frac{n^{(i-1)/2}}{r^{i/2}}
\frac{m!}{i!(m-i)!! \sqrt{m!}}
\end{equation}

Suppose  $ m = 4$ we have $ \xi(z) =
\frac{1}{\sqrt{24}}(\frac{3}{\sqrt{n}} - \frac{6\sqrt{n}}{r} z^2 +
\frac{n^{3/2}}{r^2}z^4)  = \frac{1}{\sqrt{24n}}((\frac{nz^2}{r} - 3)^2 - 6)$.
In such case we have $\E[\min \norm{Y - \sigma(Xw)}^2]=(1-r)(1-3\epsilon^2)$
\section{Orthogonal Polynomial}
Let $p_m(x) = \xi(\frac{z}{\sqrt{r}})$
where the highest order term of $p_m(x)$ is $m$.
We can show that $p_m(x)$ is a series of orthogonal polynomial sequence
with respect to the inner product
$<x^i, x^j> = \delta(i,j)(i+j-1)!!$.
Indeed, if $m_1 + m_2$ is odd, $<p_{m_1}(x), p_{m_2}(x)> = 0$.
We only need to consider $m_1 + m_2$ is even. WLOG, $m_1 < m_2$,
we can pad the higher order terms of polynomial $p_{m_1}(x)$.
Let $p_{m_i}$ be the resulting coefficient of $p_{m_i}(x)$
and $<p_{m_1}(x), p_{m_2}(x)> = p_{m_1}U^TUp_{m_2} = e^T_{m_1} e_{m_2} = 0$.

We can further show that $p_m(x)$ belongs to Hermite polynomials
under some scaling.

\section{Experiment Results}
We consider $r=\frac{2}{3}, k=40, n=60, \epsilon=0.1$ and $m=2$.
The sampling time is 6000.
For linear approximation the results is $0.3338$,
which corresponds to theoretical value $\frac{1}{3}$.
If we add $\epsilon \xi_2(z)$ perturbation
$\xi(z) = \frac{1}{\sqrt{2n}}(\frac{n}{r} z^2 -1)$,
the improvement is $0.01 * \frac{1}{3}$.
The simulation shows that the MSE is $0.3317$.
If we use sigmoid function instead of second order of Hermite polynomial,
the result is $0.3923$, which is worse than linear case.

\section{Further discussion when $t$ is large}
Above we use some approximation $n, k >> 1$ and $ t << k$.
We can weaken the assumptions. That is, we only preserve $ n >> 1$.
We can compute $N_{ij} =\frac{(2t-1)!!}{n^{t-1}} \prod_{s=0}^{t-1}
\frac{(2s+k)}{(2s+n)} = n
\prod_{s=0}^{t-1} \frac{(2s+k)(2s+1)}{(2s+n)n}$ 
when $i+j = 2t$; otherwise $N_{ij}=0$.
$M_{ij}=-\frac{1-r}{n^{t-1}} \prod_{s=0}^{t-1}
\frac{(2s+k)}{(2s+n+2)} (i-1)(j-1) (i+j-3)!! =
(1-r)(i-1)(j-1)n \prod_{s=0}^{t-1} \frac{(2s+k)(2s-1)}{(2s+n+2)n}$
\section{External Link}
Source document is available at
\url{https://gitee.com/freewind201301/non-linear-activation-function/blob/
master/non_linear.tex}

\appendix
\section{Decomposition of $\widetilde{N}$}
This section provides math induction proof for $\widetilde{N}= U^T U$ where
\begin{align}
\widetilde{N}_{ij} & = \delta(i, j) (i+j-1)!! \\
U_{ij} & = \delta(i, j) \frac{j!}{(j-i)!!\sqrt{i!}} \textrm{ for } i\leq j
\end{align}
\begin{proposition}\label{prop:UUN}
\begin{equation}
\delta(i, j) (i+j-1)!! = \sum_{k=0}^{\min\{i, j\}}
\delta(k, i) \delta(k, j) \frac{i!}{(i-k)!!\sqrt{k!}}
\frac{j!}{(j-k)!!\sqrt{k!}}
\end{equation}
\end{proposition}
\begin{remark}
If Proposition \ref{prop:UUN} holds, $\widetilde{N} = U^T U$.
\end{remark}
\begin{proof}
If $ i + j $ is odd, $ \delta(i, j) = 0 $ and
$ \delta(k, i)\delta(k, j) = 0 $.
We first consider both $ i, j $ are even. In this case, we need to compute
\begin{equation}\label{eq:even}
A(i, j) = \sum_{k=0}^{\min\{i,j\}}
\frac{(2i)!(2j)!}{(2i-2k)!!(2j-2k)!!(2k)!} \textrm{ for } i, j \geq 0
\end{equation}
If both $ i, j $ are odd, we need to compute
\begin{equation}\label{eq:odd}
B(i, j) = \sum_{k=0}^{\min\{i,j\}}
\frac{(2i+1)!(2j+1)!}{(2i-2k)!!(2j-2k)!!(2k+1)!} \textrm{ for } i,j \geq 0
\end{equation}
We can see that $A(i, j) = A(j, i)$,
$B(i, j) = B(j, i)$
We have
\begin{align}
B(i, j) &= \sum_{k=0}^{\min\{i,j\}}
\frac{(2i+1)!(2j)!}{(2i-2k)!!(2j-2k-2)!!(2k)!}
\left(\frac{1}{2j-2k} + \frac{1}{2k+1}\right) \notag\\
&= (2i + 1) A(i, j) + (2j) B(i, j-1) \label{eq:Bij}
\end{align}
Above formula holds for $ i \leq j-1 $.
If $ i \geq j $, we can use the symmetric property to show it also holds.
Similarly we have
\begin{equation}\label{eq:Aij}
A(i, j) = (2j - 1) A(i, j-1) + (2i) B(i-1, j-1)
\end{equation}
We use math induction to show
\begin{align}\label{eq:UUN_conclusion}
A(i, j) & = (2i + 2j - 1)!! \\
B(i, j) & = (2i + 2j + 1)!!
\end{align}
We first show that when $i = j = 0$,
Equation \eqref{eq:UUN_conclusion} holds.
Using math induction, we assume \eqref{eq:UUN_conclusion}
holds for $ 0 \leq i, j \leq t$. When we consider
$j = t + 1$, from Equation \eqref{eq:Aij} we have
\begin{align*}
A(i, t+1) & = (2t+1) A(i, t) + (2i) B(i-1, t) \\
& = (2t+1)(2i+2t-1)!! + (2i) (2i+2t-1)!! \\
& = (2i + 2t + 1)!!
\end{align*}
This holds for $ i < t + 1 $.
Using symmetric property we have $A(t+1, j) = (2t + 2j + 1)!!$ for
$ j < t + 1$
\begin{align*}
A(t+1, t+1) & = (2t+1)A(t+1, t) + (2t+2)B(t, t) \\
& = (2t-1)(4t+1)!! + (2t+2)(4t+1)!! \\
& = (4t+3)!!
\end{align*}
From Equation \eqref{eq:Bij} we can get the expression for $B(i, j)$.
\end{proof}
\section{Decomposition of $\widetilde{M}$}
This section provides math induction proof for
$\widetilde{M}= (1-r) U^T \Lambda U$ where
\begin{align}
\widetilde{M}_{ij} & = -(1-r)\delta(i, j) (i-1)(j-1) (i+j-3)!! \\
U_{ij} & = \delta(i, j) \frac{j!}{(j-i)!!\sqrt{i!}} \textrm{ for } i\leq j \\
\Lambda_{ij} & = 1_{i = j} (1-i)
\end{align}
\begin{proposition}\label{prop:UUM}
\begin{equation}
\delta(i, j)(i-1)(j-1)(i+j-3)!! = \sum_{k=0}^{\min\{i, j\}}
\delta(k, i) \delta(k, j) \frac{(1-k)i!j!}{(i-k)!!(j-k)!!k!}
\end{equation}
\begin{proof}
\begin{align*}
RHS & = \sum_{k=0}^{\min\{i, j\}}
\delta(k, i) \delta(k, j) \frac{i!j!}{(i-k)!!(j-k)!!k!}
- \sum_{k=1}^{\min\{i, j\}}
\delta(k, i) \delta(k, j) \frac{i!j!}{(i-k)!!(j-k)!!(k-1)!} \\
& = \delta(i, j)(i+j-1)!! - ij\sum_{k=0}^{\min\{i-1, j-1\}}
\delta(k, i) \delta(k, j) \frac{(i-1)!(j-1)!}{(i-1-k)!!(j-1-k)!!k!} \\
& = \delta(i, j)(i+j-1)!! - \delta(i, j)ij(i+j-3)!! \\
& = \delta(i, j)(i-1)(j-1)(i+j-3)!!
\end{align*}
\end{proof}
\end{proposition}
\section{Proof of $Up=e_m$}
\begin{proof}
$p_i = (-1)^{(m-i)/2}\delta(i, m) \frac{m!}{i!(m-i)!! \sqrt{m!}} $.
First we can show that $U_{mm} p_m = 1 $. For $ 0\leq i < m $ we
will show that $ \sum_{j=i}^m U_{ij} p_j = 0 $. In specific form as
\begin{equation}
\sum_{j=i}^m \delta(i, j) \frac{j!}{(j-i)!!\sqrt{i!}} (-1)^{(m-j)/2}\delta(j, m) \frac{m!}{j!(m-j)!! \sqrt{m!}} = 0
\end{equation}
It is equivalent to
\begin{align*}
\sum_{j=i, j+=2}^m (-1)^{(m-j)/2} \frac{1}{(m-j)!!(j-i)!!}  = 0 & \iff \\ 
\sum_{j=0, j+=2}^{2m} (-1)^{(2m-j)/2} \frac{1}{(2m-j)!! j!!} = 0 & \iff \\
\sum_{j=0}^{m} (-1)^{m-j} \frac{1}{(2m-2j)!! (2j)!!} = 0 & \iff \\
\sum_{j=0}^{m} (-1)^{m-j} \frac{1}{(m-j)! j!} = 0 & \iff \\
\sum_{j=0}^{m} (-1)^{m-j} \binom{m}{j} = 0 & \iff \\
(1+x)^m = 0 (x=-1)
\end{align*}
\end{proof}
\section{Another proof of $-(m-1)$}
Using Lagrange multiplier we get an unconstraint optimization problem:
$p^T \widetilde{M} p - \lambda p^T \widetilde{N} p $. Let the derivative of this expression equal to zero
we have $(\widetilde{M}-\lambda \widetilde{N})p=0$ where $\lambda = p^T \widetilde{M} p$.
Therefore, we choose the minimal $\lambda$ which make the determinant of $(\widetilde{M}-\lambda \widetilde{N})$ equal to zero.
We have the following proposition:
\begin{proposition}
\begin{equation}
\det[\widetilde{M}-\lambda \widetilde{N}] = \prod_{i=0}^m i!(-(i-1) - \lambda)
\end{equation}
\end{proposition}
From the above proposition, we say that the minimal $\lambda$ is exactly $-(m-1)$.
\bibliographystyle{plain}
\bibliography{exportlist}
\end{document}
